"""Exploratory analysis of the cereal data set.

Loads ``cereal.csv`` into a DataFrame and draws, in order:

1. a horizontal bar chart of the nutritional columns of every cereal,
2. a scatter chart of calories per cereal,
3. a pie chart of the mean rating per manufacturer,
4. a sunburst chart of manufacturer -> type -> cereal coloured by rating,
5. a calories-vs-rating scatter chart with an OLS trendline,
6. a seaborn regression plot of calories vs rating.

The bare expressions (``cereal_df``, ``cereal_df.describe(...)``,
``calories_df``) only render output in an interactive/notebook session.
"""
# Importing the desired modules
import pandas as pd
import numpy as py  # NOTE(review): unused below; aliased "py", not the usual "np"
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Location of the data set on the author's machine.
CEREAL_CSV_PATH = r"C:\Solaris\MY COURSES\DATA ANALYTICS\DATA SETS\Cereal Data\cereal.csv"

cereal_df = pd.read_csv(CEREAL_CSV_PATH)
pd.set_option('display.max_rows', None)  # Display all rows in the data frame

# Every -1 in the frame becomes 0.
# NOTE(review): presumably -1 marks a missing value in this data set - confirm.
cereal_df.replace(-1, 0, inplace=True)
cereal_df

# Statistical summary of the data
cereal_df.describe(include='all')

# ---------------------------------------------------------------------------
# Graph depicting the various nutritional components of each cereal
# ---------------------------------------------------------------------------
nutrition_columns = [
    'name', 'protein', 'fat', 'sodium', 'fiber',
    'carbo', 'sugars', 'potass', 'vitamins',
]
# Index by cereal name so each cereal becomes one group of bars.
nutrition_df = cereal_df[nutrition_columns].set_index('name')
# DataFrame.plot returns the matplotlib Axes (kept under its original name).
fig1 = nutrition_df.plot(kind='barh', figsize=(20, 500))

# Adding value labels to the chart: one bar container per plotted column,
# so iterate over whatever was drawn instead of hard-coding the count.
for container in fig1.containers:
    plt.bar_label(container, label_type='edge', padding=15, fontsize=20)

# Increasing the size of the ticks
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# Adding labels
plt.xlabel('Quantity in grams', fontsize=20)
plt.ylabel('Cereal', fontsize=20)

# Increasing the font size of the legend
plt.legend(fontsize=20)

# ---------------------------------------------------------------------------
# Determining the foods with the highest calories
# ---------------------------------------------------------------------------
calories_df = cereal_df[['name', 'calories']]
calories_df

fig_calories = px.scatter(calories_df, x="calories", y="name")
fig_calories.update_layout(
    title='CEREALS AND THEIR CALORIE CONTENT',
    font_family="Georgia",
    title_font_family="Times New Roman",
    title_font_color="Blue",
    title_x=0.5,
    xaxis_title="Calories",
    yaxis_title="Cereal Name",
)
fig_calories.update_xaxes(
    title_font_color="Blue", title_font={'size': 15}, title_standoff=25
)
fig_calories.update_yaxes(
    title_font_color="Blue", title_font={'size': 15}, title_standoff=25
)
fig_calories.show()

# ---------------------------------------------------------------------------
# Which manufacturer produces the most popular cereals?
# For this we take the average of the ratings for each manufacturer and
# create a pie chart.
# ---------------------------------------------------------------------------
popular_df = cereal_df[['mfr', 'rating']]
# Mean rating per manufacturer, rounded to 2 decimals, with 'mfr' as a column.
group = popular_df.groupby('mfr').mean().round(2).reset_index()

figure = px.pie(
    group,
    values='rating',
    names='mfr',
    title="Manufacturers Vs Ratings",
    hole=0.3,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
figure.update_layout(title_x=0.5)
figure.update_traces(textinfo='label + value')
figure.show()
# From the pie chart it can be deduced that manufacturer N is the king of
# the market: N has a higher rating (average of the ratings of its products)
# than the other manufacturers.

# ---------------------------------------------------------------------------
# Sunburst chart to easily identify the top-rated and the bottom-rated
# cereals. The chart also helps in easy visual identification of each
# manufacturer's popular and least favourite cereals.
# ---------------------------------------------------------------------------
sun_fig = px.sunburst(
    cereal_df,
    path=['mfr', 'type', 'name'],
    color='rating',
    color_continuous_scale='RdBu',
)
sun_fig.show()
# All of manufacturer N's cereals are rated highly.
# All of manufacturer G's cereals belong to the lower end of the rating scale.
# While all manufacturers produce cereal of type C,
# manufacturers N and Q are the only ones producing additional cereals of
# type H.

# ---------------------------------------------------------------------------
# Scatter chart to see if there is any correlation between the ratings and
# the calorie content of the cereal boxes.
# ---------------------------------------------------------------------------
# NOTE(review): plotly fits an "ols" trendline per colour group; with
# color='name' each group is presumably a single cereal, so these are not an
# overall trend. Consider trendline_scope='overall' if the installed plotly
# supports it - confirm before changing.
df1 = px.scatter(
    cereal_df,
    x='calories',
    y='rating',
    color='name',
    trendline="ols",
)
df1.update_layout(
    title='CALORIE VS RATING',
    title_font_family='Times New Roman',
    xaxis_title='Calories',
    yaxis_title='rating',
    title_x=0.4,
    plot_bgcolor='black',
    yaxis=dict(showgrid=False),
    xaxis=dict(showgrid=False),
    legend_title='Cereal',
    paper_bgcolor='khaki',
)
df1.show()
# The graph clearly indicates that the lower the calories, the higher the
# rating.

# ---------------------------------------------------------------------------
# Using a trendline to affirm that rating and calories have an inverse
# relationship: the fewer the calories, the higher the rating of the cereal.
# ---------------------------------------------------------------------------
# The figure size must be configured BEFORE the plot is drawn; setting it
# afterwards has no effect on the existing figure. sns.set also applies
# seaborn's default theme.
sns.set(rc={'figure.figsize': (10, 5)})
# Start a fresh figure so the regression is not drawn onto the axes of the
# nutrition bar chart when this runs outside a notebook.
plt.figure()
sns.regplot(
    x='calories',
    y='rating',
    data=cereal_df,
    scatter_kws={'color': 'orange'},
)
plt.ylim(bottom=0)  # anchor the rating axis at zero